0. 데이터 로드 dwaine

# 데이터 읽어오기
# url이거나 txt파일인 경우 -> read.table
# csv 파일인 경우 -> read.csv
# 엑셀 파일인 경우 -> read.excel

dwaine<-read.table(
  "http://www.stat.ufl.edu/~rrandles/sta4210/Rclassnotes/data/textdatasets/KutnerData/Chapter%20%206%20Data%20Sets/CH06FI05.txt")


# 컬럼 지정
names(dwaine)<-c("X1","X2","Y")

# 데이터 확인
# 일부
head(dwaine)
##     X1   X2     Y
## 1 68.5 16.7 174.4
## 2 45.2 16.8 164.4
## 3 91.3 18.2 244.2
## 4 47.8 16.3 154.6
## 5 46.9 17.3 181.6
## 6 66.1 18.2 207.5
# 차원
dim(dwaine)
## [1] 21  3
# 요약
summary(dwaine)
##        X1              X2              Y        
##  Min.   :38.40   Min.   :15.80   Min.   :137.2  
##  1st Qu.:47.80   1st Qu.:16.30   1st Qu.:152.8  
##  Median :52.30   Median :17.10   Median :166.5  
##  Mean   :62.02   Mean   :17.14   Mean   :181.9  
##  3rd Qu.:82.70   3rd Qu.:18.10   3rd Qu.:209.7  
##  Max.   :91.30   Max.   :19.10   Max.   :244.2

1. 시각화 pairs, pairs2, 3d

pairs(dwaine)

library(ggplot2)
library(GGally)
ggpairs(dwaine)

library(dplyr)
library(plotly)
p <- plot_ly(dwaine, x = ~X1, y = ~X2, z = ~Y,
             colors = c('#BF382A')) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'X1'),
                      yaxis = list(title = 'X2'),
                      zaxis = list(title = 'Y')))
p

2. 선형 회귀 모델 추정

2-1. 회귀 계수(bhat) 계산

Y<-dwaine$Y
X<-cbind(1,dwaine$X1, dwaine$X2)
bhat<-solve(t(X)%*%X)%*%t(X)%*%Y
bhat
##           [,1]
## [1,] -68.85707
## [2,]   1.45456
## [3,]   9.36550

2-2. lm 함수로 회귀 모델 추정

lm.dwaine<- lm(Y~X1+X2,data=dwaine)
summary(lm.dwaine)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = dwaine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4239  -6.2161   0.7449   9.4356  20.2151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -68.8571    60.0170  -1.147   0.2663    
## X1            1.4546     0.2118   6.868    2e-06 ***
## X2            9.3655     4.0640   2.305   0.0333 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.01 on 18 degrees of freedom
## Multiple R-squared:  0.9167, Adjusted R-squared:  0.9075 
## F-statistic:  99.1 on 2 and 18 DF,  p-value: 1.921e-10

2-3. 예측값(yhat)계산

Yhat<-fitted(lm.dwaine)
Yhat.1<-X%*% bhat

H<-X%*%solve(t(X)%*%X)%*%t(X)
Yhat.2<-H%*%Y

head(cbind(Yhat, Yhat.1, Yhat.2))
##       Yhat                  
## 1 187.1841 187.1841 187.1841
## 2 154.2294 154.2294 154.2294
## 3 234.3963 234.3963 234.3963
## 4 153.3285 153.3285 153.3285
## 5 161.3849 161.3849 161.3849
## 6 197.7414 197.7414 197.7414
plot(Yhat~Y)

plot(Yhat ~ Y,
     xlab = "Observed Y", 
     ylab = "Predicted Y", 
     pch = 19, 
     xlim = c(130, 250),
     ylim = c(130, 250),
     col = "gray30",
     cex = 1.5)

abline(0, 1, col = "steelblue", lwd = 1)
grid(nx = NULL, ny = NULL, col = "lightgray", lty = "dashed")

2-4. 잔차(예측값(yhat)과 실제값(y) 차이) 계산

Y-Yhat
##           1           2           3           4           5           6 
## -12.7841146  10.1705737   9.8036764   1.2714690  20.2150722   9.7585779 
##           7           8           9          10          11          12 
##   0.7449178  -4.6666316 -12.3381967   0.3539791  11.5126289  -6.0849209 
##          13          14          15          16          17          18 
##   9.3142995   3.7815611 -13.1132116 -18.4238900   0.6530062 -15.0013134 
##          19          20          21 
##   1.6129777  -6.2160615   9.4356009
residuals(lm.dwaine)
##           1           2           3           4           5           6 
## -12.7841146  10.1705737   9.8036764   1.2714690  20.2150722   9.7585779 
##           7           8           9          10          11          12 
##   0.7449178  -4.6666316 -12.3381967   0.3539791  11.5126289  -6.0849209 
##          13          14          15          16          17          18 
##   9.3142995   3.7815611 -13.1132116 -18.4238900   0.6530062 -15.0013134 
##          19          20          21 
##   1.6129777  -6.2160615   9.4356009

2-5. 잔차(residuals) 시각화

library(ggfortify)
autoplot(lm.dwaine)

2-6. 분산분석(ANOVA)

anova(lm.dwaine)
## Analysis of Variance Table
## 
## Response: Y
##           Df  Sum Sq Mean Sq  F value   Pr(>F)    
## X1         1 23371.8 23371.8 192.8962 4.64e-11 ***
## X2         1   643.5   643.5   5.3108  0.03332 *  
## Residuals 18  2180.9   121.2                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

2-7. 회귀 모델 요약

summary(lm.dwaine)
## 
## Call:
## lm(formula = Y ~ X1 + X2, data = dwaine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.4239  -6.2161   0.7449   9.4356  20.2151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -68.8571    60.0170  -1.147   0.2663    
## X1            1.4546     0.2118   6.868    2e-06 ***
## X2            9.3655     4.0640   2.305   0.0333 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.01 on 18 degrees of freedom
## Multiple R-squared:  0.9167, Adjusted R-squared:  0.9075 
## F-statistic:  99.1 on 2 and 18 DF,  p-value: 1.921e-10

3. 예측 구간과 신뢰 구간 계산

3-1. Confidence Interval for Mean Response

Define the vector \(X_h\):

\[ X_h = \begin{bmatrix} 1 \\ X_{h,1} \\ \vdots \\ X_{h,p-1} \end{bmatrix}, \quad E\{Y_h\} = X_h^T \beta, \quad \hat{Y}_h = X_h^T b \]

Variance of \(\hat{Y}_h\):

\[ \sigma^2\{\hat{Y}_h\} = \sigma^2 X_h^T (X^T X)^{-1} X_h \]

Estimated variance:

\[ s^2\{\hat{Y}_h\} = MSE \, (X_h^T (X^T X)^{-1} X_h) = X_h^T s^2\{b\} X_h \]

The \((1-\alpha)\) confidence limits for \(E\{Y_h\}\) are:

\[ \hat{Y}_h \; \pm \; t(1-\alpha/2; n-p) \; s\{\hat{Y}_h\} \]

head(predict(lm.dwaine, newdata=dwaine, interval = "confidence"))
##        fit      lwr      upr
## 1 187.1841 179.1146 195.2536
## 2 154.2294 146.7591 161.6998
## 3 234.3963 224.7569 244.0358
## 4 153.3285 146.5361 160.1210
## 5 161.3849 152.0778 170.6921
## 6 197.7414 188.5424 206.9404
predict.lm(lm.dwaine, newdata=data.frame(X1=65.4, X2=17.6), interval="confidence", level=0.95)
##        fit      lwr      upr
## 1 191.1039 185.2911 196.9168

3-2. Prediction Interval for a New Observation

The \((1 - \alpha)\) prediction limits for a new observation \(Y_{h}^{(new)}\) corresponding to \(X_h\), the specified values of the \(X\) variables, are:

\[ \hat{Y}_h \; \pm \; t(1-\alpha/2; \, n-p) \, s\{pred\} \]

where:

\[ s^2\{pred\} = MSE + s^2\{\hat{Y}_h\} = MSE \Big( 1 + X_h^T (X^T X)^{-1} X_h \Big) \]

head(predict(lm.dwaine, newdata=dwaine, interval = "prediction"))
##        fit      lwr      upr
## 1 187.1841 162.6910 211.6772
## 2 154.2294 129.9271 178.5317
## 3 234.3963 209.3421 259.4506
## 4 153.3285 129.2260 177.4311
## 5 161.3849 136.4566 186.3132
## 6 197.7414 172.8533 222.6295
predict.lm(lm.dwaine, newdata=data.frame(X1=65.4, X2=17.6), interval="prediction", level=0.95)
##        fit      lwr     upr
## 1 191.1039 167.2589 214.949

The End